For more information about experimental design, talk to Dr. Danielle Lemay and Zeynep Alkan at the USDA-ARS WHNRC in Davis, CA.
data_dir = "/Users/chad.masarweh/Documents/Transwell"
data_dir2 = "/Users/chad.masarweh/Documents"
data_dir3= "/Users/chad.masarweh/Documents/FL100_SCFA/input_data"
library(ggplot2)
library(gridExtra)
library(grid)
library(rlang)
library("readxl")
library(stringr)
library(car)
library(ggResidpanel)
library(patchwork)
library(broom)
library(glue)
library(cowplot)
library(kableExtra)
library(tibble)
library(dplyr)
library(tidyr)
library(purrr)
library(readr)
library(conflicted)
library(moments)
conflict_prefer("select", "dplyr")
conflict_prefer("filter", "dplyr")
This is an unmodified head() view of the data I
received. It is sorted by “experiment.date”, then “treatment”. The
columns “HBSS.ave” and “initial.TEER.OK” are not tidy and need to be
tidy’d.
# Load the Excel file
rawdata <- read_excel(file.path(data_dir, "experiment_sort_no_changes.xlsx"))
# Display only the first 20 rows
knitr::kable(head(rawdata))
| experiment.date | treatment | %TEER.change.raw | HBSS.ave | IL8 | food.particulate | oily | initial.TEER.OK | TW.day | TW.lot | protein.lysate | cell.passage | repeat.status |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2018-03-13 | HBSS | -2.941177 | -1.470588 | 13.9600 | NA | NA | yes | 14 | NA | no | 20 | NA |
| 2018-03-13 | HBSS | 0.000000 | NA | 10.4885 | NA | NA | yes | 14 | NA | no | 20 | NA |
| 2018-03-13 | LPS .TNFa | -24.767802 | NA | 117.3300 | NA | NA | yes | 14 | NA | no | 20 | NA |
| 2018-03-13 | 7011 | 13.664596 | NA | 29.1005 | no | NA | yes | 14 | NA | no | 20 | repeated |
| 2018-03-13 | 7011 | 14.461538 | NA | 36.6470 | no | NA | yes | 14 | NA | no | 20 | repeated |
| 2018-03-13 | 7056 | 25.304878 | NA | 25.6035 | yes | yes | yes | 14 | NA | no | 20 | repeated (3 exp) |
knitr::kable(tail(rawdata))
| experiment.date | treatment | %TEER.change.raw | HBSS.ave | IL8 | food.particulate | oily | initial.TEER.OK | TW.day | TW.lot | protein.lysate | cell.passage | repeat.status |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2024-05-14 | 8019 | 12.631579 | NA | 10.7655 | no | yes | yes | 14 | 9722026 | yes | 22 | repeated |
| 2024-05-14 | 8019 | 10.104530 | NA | 7.7980 | no | yes | yes | 14 | 9722026 | yes | 22 | repeated |
| 2024-05-14 | 8100 | 41.379310 | NA | 27.0370 | yes | yes | yes | 14 | 9722026 | yes | 22 | repeated |
| 2024-05-14 | 8100 | 37.288136 | NA | 17.3265 | yes | yes | yes | 14 | 9722026 | yes | 22 | repeated |
| 2024-05-14 | 8102 | -4.347826 | NA | 4.5165 | no | no | yes | 14 | 9722026 | yes | 22 | repeated |
| 2024-05-14 | 8102 | -5.144694 | NA | 4.8815 | no | no | yes | 14 | 9722026 | yes | 22 | repeated |
# Create a list object for read_csv(...,col_types = ,...) and setNames() to define the column types and name them, respectively, because argument col_types requires a list
data_cols <- cols(
experiment = col_date(format = "%Y-%m-%d"),
treatment = col_character(),
percTEERdelta = col_double(),
percTEERdelta_nctrl_ave = col_double(),
IL8 = col_double(),
chunkypoop = col_character(),
oily = col_character(),
initial.TEER.ok = col_character(),
TWday = col_integer(),
TWlot = col_integer(),
proteinLysate = col_logical(),
cellPassage = col_integer(),
comments = col_character()
)
# Read file, modify column types with col_types, then select the first 13 columns, then change the column names
data <- read_csv(file.path(data_dir, "experiment_sort_no_changes.csv"), col_types = data_cols) %>%
filter(rowSums(!is.na(.)) > 0) %>% # importing this data creates four empty rows at the tail
setNames(names(data_cols$cols)) %>%
select(1:13) # Zeynep's data has several empty columns
## New names:
## • `` -> `...14`
## • `` -> `...15`
## • `` -> `...16`
## • `` -> `...17`
## • `` -> `...18`
## • `` -> `...19`
## • `` -> `...20`
## • `` -> `...21`
## • `` -> `...22`
## • `` -> `...23`
## • `` -> `...24`
knitr::kable(head(data))
| experiment | treatment | percTEERdelta | percTEERdelta_nctrl_ave | IL8 | chunkypoop | oily | initial.TEER.ok | TWday | TWlot | proteinLysate | cellPassage | comments |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2018-03-13 | HBSS | -2.94 | -1.47 | 13.96 | NA | NA | yes | 14 | NA | no | 20 | NA |
| 2018-03-13 | HBSS | 0.00 | NA | 10.49 | NA | NA | yes | 14 | NA | no | 20 | NA |
| 2018-03-13 | LPS .TNFa | -24.77 | NA | 117.33 | NA | NA | yes | 14 | NA | no | 20 | NA |
| 2018-03-13 | 7011 | 13.66 | NA | 29.10 | no | NA | yes | 14 | NA | no | 20 | repeated |
| 2018-03-13 | 7011 | 14.46 | NA | 36.65 | no | NA | yes | 14 | NA | no | 20 | repeated |
| 2018-03-13 | 7056 | 25.30 | NA | 25.60 | yes | yes | yes | 14 | NA | no | 20 | repeated (3 exp) |
knitr::kable(tail(data))
| experiment | treatment | percTEERdelta | percTEERdelta_nctrl_ave | IL8 | chunkypoop | oily | initial.TEER.ok | TWday | TWlot | proteinLysate | cellPassage | comments |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2024-05-14 | 8019 | 12.63 | NA | 10.77 | no | yes | yes | 14 | 9722026 | yes | 22 | repeated |
| 2024-05-14 | 8019 | 10.10 | NA | 7.80 | no | yes | yes | 14 | 9722026 | yes | 22 | repeated |
| 2024-05-14 | 8100 | 41.38 | NA | 27.04 | yes | yes | yes | 14 | 9722026 | yes | 22 | repeated |
| 2024-05-14 | 8100 | 37.29 | NA | 17.33 | yes | yes | yes | 14 | 9722026 | yes | 22 | repeated |
| 2024-05-14 | 8102 | -4.35 | NA | 4.52 | no | no | yes | 14 | 9722026 | yes | 22 | repeated |
| 2024-05-14 | 8102 | -5.14 | NA | 4.88 | no | no | yes | 14 | 9722026 | yes | 22 | repeated |
# Convert empty values to NA and trim spaces
tidydata <- data %>%
mutate(across(where(is.character), ~na_if(.x, ""))) %>%
mutate(across(where(is.character), str_trim))
knitr::kable(head(tidydata))
| experiment | treatment | percTEERdelta | percTEERdelta_nctrl_ave | IL8 | chunkypoop | oily | initial.TEER.ok | TWday | TWlot | proteinLysate | cellPassage | comments |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2018-03-13 | HBSS | -2.94 | -1.47 | 13.96 | NA | NA | yes | 14 | NA | no | 20 | NA |
| 2018-03-13 | HBSS | 0.00 | NA | 10.49 | NA | NA | yes | 14 | NA | no | 20 | NA |
| 2018-03-13 | LPS .TNFa | -24.77 | NA | 117.33 | NA | NA | yes | 14 | NA | no | 20 | NA |
| 2018-03-13 | 7011 | 13.66 | NA | 29.10 | no | NA | yes | 14 | NA | no | 20 | repeated |
| 2018-03-13 | 7011 | 14.46 | NA | 36.65 | no | NA | yes | 14 | NA | no | 20 | repeated |
| 2018-03-13 | 7056 | 25.30 | NA | 25.60 | yes | yes | yes | 14 | NA | no | 20 | repeated (3 exp) |
knitr::kable(tail(tidydata))
| experiment | treatment | percTEERdelta | percTEERdelta_nctrl_ave | IL8 | chunkypoop | oily | initial.TEER.ok | TWday | TWlot | proteinLysate | cellPassage | comments |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2024-05-14 | 8019 | 12.63 | NA | 10.77 | no | yes | yes | 14 | 9722026 | yes | 22 | repeated |
| 2024-05-14 | 8019 | 10.10 | NA | 7.80 | no | yes | yes | 14 | 9722026 | yes | 22 | repeated |
| 2024-05-14 | 8100 | 41.38 | NA | 27.04 | yes | yes | yes | 14 | 9722026 | yes | 22 | repeated |
| 2024-05-14 | 8100 | 37.29 | NA | 17.33 | yes | yes | yes | 14 | 9722026 | yes | 22 | repeated |
| 2024-05-14 | 8102 | -4.35 | NA | 4.52 | no | no | yes | 14 | 9722026 | yes | 22 | repeated |
| 2024-05-14 | 8102 | -5.14 | NA | 4.88 | no | no | yes | 14 | 9722026 | yes | 22 | repeated |
# Replace values in the "treatment" column
tidydata <- tidydata %>%
mutate(
treatment = str_replace_all(treatment, c("HBSS" = "nctrl", "LPS .TNFa" = "pctrl")),
comments = str_replace_all(comments, " ", "_")
)
knitr::kable(head(tidydata))
| experiment | treatment | percTEERdelta | percTEERdelta_nctrl_ave | IL8 | chunkypoop | oily | initial.TEER.ok | TWday | TWlot | proteinLysate | cellPassage | comments |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2018-03-13 | nctrl | -2.94 | -1.47 | 13.96 | NA | NA | yes | 14 | NA | no | 20 | NA |
| 2018-03-13 | nctrl | 0.00 | NA | 10.49 | NA | NA | yes | 14 | NA | no | 20 | NA |
| 2018-03-13 | pctrl | -24.77 | NA | 117.33 | NA | NA | yes | 14 | NA | no | 20 | NA |
| 2018-03-13 | 7011 | 13.66 | NA | 29.10 | no | NA | yes | 14 | NA | no | 20 | repeated |
| 2018-03-13 | 7011 | 14.46 | NA | 36.65 | no | NA | yes | 14 | NA | no | 20 | repeated |
| 2018-03-13 | 7056 | 25.30 | NA | 25.60 | yes | yes | yes | 14 | NA | no | 20 | repeated_(3_exp) |
knitr::kable(tail(tidydata))
| experiment | treatment | percTEERdelta | percTEERdelta_nctrl_ave | IL8 | chunkypoop | oily | initial.TEER.ok | TWday | TWlot | proteinLysate | cellPassage | comments |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2024-05-14 | 8019 | 12.63 | NA | 10.77 | no | yes | yes | 14 | 9722026 | yes | 22 | repeated |
| 2024-05-14 | 8019 | 10.10 | NA | 7.80 | no | yes | yes | 14 | 9722026 | yes | 22 | repeated |
| 2024-05-14 | 8100 | 41.38 | NA | 27.04 | yes | yes | yes | 14 | 9722026 | yes | 22 | repeated |
| 2024-05-14 | 8100 | 37.29 | NA | 17.33 | yes | yes | yes | 14 | 9722026 | yes | 22 | repeated |
| 2024-05-14 | 8102 | -4.35 | NA | 4.52 | no | no | yes | 14 | 9722026 | yes | 22 | repeated |
| 2024-05-14 | 8102 | -5.14 | NA | 4.88 | no | no | yes | 14 | 9722026 | yes | 22 | repeated |
tidydata <- tidydata %>%
mutate(oily = case_when(
oily == "oily" ~ 1,
oily == "yes" ~ 1,
oily == "no" ~ 0,
TRUE ~ NA_real_ # Handle any other values as NA
)) %>%
mutate(chunkypoop = case_when(
chunkypoop == "yes" ~ 1,
chunkypoop == "no" ~ 0,
TRUE ~ NA_real_ # Handle any other values as NA
))
knitr::kable(head(tidydata))
| experiment | treatment | percTEERdelta | percTEERdelta_nctrl_ave | IL8 | chunkypoop | oily | initial.TEER.ok | TWday | TWlot | proteinLysate | cellPassage | comments |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2018-03-13 | nctrl | -2.94 | -1.47 | 13.96 | NA | NA | yes | 14 | NA | no | 20 | NA |
| 2018-03-13 | nctrl | 0.00 | NA | 10.49 | NA | NA | yes | 14 | NA | no | 20 | NA |
| 2018-03-13 | pctrl | -24.77 | NA | 117.33 | NA | NA | yes | 14 | NA | no | 20 | NA |
| 2018-03-13 | 7011 | 13.66 | NA | 29.10 | 0 | NA | yes | 14 | NA | no | 20 | repeated |
| 2018-03-13 | 7011 | 14.46 | NA | 36.65 | 0 | NA | yes | 14 | NA | no | 20 | repeated |
| 2018-03-13 | 7056 | 25.30 | NA | 25.60 | 1 | 1 | yes | 14 | NA | no | 20 | repeated_(3_exp) |
knitr::kable(tail(tidydata))
| experiment | treatment | percTEERdelta | percTEERdelta_nctrl_ave | IL8 | chunkypoop | oily | initial.TEER.ok | TWday | TWlot | proteinLysate | cellPassage | comments |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2024-05-14 | 8019 | 12.63 | NA | 10.77 | 0 | 1 | yes | 14 | 9722026 | yes | 22 | repeated |
| 2024-05-14 | 8019 | 10.10 | NA | 7.80 | 0 | 1 | yes | 14 | 9722026 | yes | 22 | repeated |
| 2024-05-14 | 8100 | 41.38 | NA | 27.04 | 1 | 1 | yes | 14 | 9722026 | yes | 22 | repeated |
| 2024-05-14 | 8100 | 37.29 | NA | 17.33 | 1 | 1 | yes | 14 | 9722026 | yes | 22 | repeated |
| 2024-05-14 | 8102 | -4.35 | NA | 4.52 | 0 | 0 | yes | 14 | 9722026 | yes | 22 | repeated |
| 2024-05-14 | 8102 | -5.14 | NA | 4.88 | 0 | 0 | yes | 14 | 9722026 | yes | 22 | repeated |
# Convert initial.TEER.ok to numeric where possible.
## because the column is either "yes" or a number, doing this changes all string values to an NA so that in the next step, we can make new columns based on the numeric value of "initial.TEER.ok"
tidydata <- tidydata %>%
mutate(initial.TEER.ok = suppressWarnings(as.integer(initial.TEER.ok)))
# Create initialTEERrange and initialTEER
tidydata <- tidydata %>%
mutate(
initialTEER = ifelse(is.na(initial.TEER.ok), NA, initial.TEER.ok), # same notation as excel
initialTEERrange = case_when(
is.na(initial.TEER.ok) ~ "ok",
initial.TEER.ok < 250 ~ "lo",
initial.TEER.ok > 350 ~ "hi",
TRUE ~ NA_character_ # Error case
)
)
# Check for invalid cases in initialTEERrange
if (any(is.na(tidydata$initialTEERrange))) {
stop("Error: Some initial.TEER.ok values fall within 250-350, which, according to Zeynep Alkan, is not allowed.")
} else {
tidydata <- tidydata %>% select(-initial.TEER.ok)
}
invisible(lapply(names(tidydata), function(col) {
cat("Column:", col, "\n")
cat(" typeof:", typeof(tidydata[[col]]), "\n")
cat(" class :", class(tidydata[[col]]), "\n\n")
}))
## Column: experiment
## typeof: double
## class : Date
##
## Column: treatment
## typeof: character
## class : character
##
## Column: percTEERdelta
## typeof: double
## class : numeric
##
## Column: percTEERdelta_nctrl_ave
## typeof: double
## class : numeric
##
## Column: IL8
## typeof: double
## class : numeric
##
## Column: chunkypoop
## typeof: double
## class : numeric
##
## Column: oily
## typeof: double
## class : numeric
##
## Column: TWday
## typeof: double
## class : numeric
##
## Column: TWlot
## typeof: double
## class : numeric
##
## Column: proteinLysate
## typeof: character
## class : character
##
## Column: cellPassage
## typeof: double
## class : numeric
##
## Column: comments
## typeof: character
## class : character
##
## Column: initialTEER
## typeof: integer
## class : integer
##
## Column: initialTEERrange
## typeof: character
## class : character
knitr::kable(head(tidydata))
| experiment | treatment | percTEERdelta | percTEERdelta_nctrl_ave | IL8 | chunkypoop | oily | TWday | TWlot | proteinLysate | cellPassage | comments | initialTEER | initialTEERrange |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2018-03-13 | nctrl | -2.94 | -1.47 | 13.96 | NA | NA | 14 | NA | no | 20 | NA | NA | ok |
| 2018-03-13 | nctrl | 0.00 | NA | 10.49 | NA | NA | 14 | NA | no | 20 | NA | NA | ok |
| 2018-03-13 | pctrl | -24.77 | NA | 117.33 | NA | NA | 14 | NA | no | 20 | NA | NA | ok |
| 2018-03-13 | 7011 | 13.66 | NA | 29.10 | 0 | NA | 14 | NA | no | 20 | repeated | NA | ok |
| 2018-03-13 | 7011 | 14.46 | NA | 36.65 | 0 | NA | 14 | NA | no | 20 | repeated | NA | ok |
| 2018-03-13 | 7056 | 25.30 | NA | 25.60 | 1 | 1 | 14 | NA | no | 20 | repeated_(3_exp) | NA | ok |
knitr::kable(tail(tidydata))
| experiment | treatment | percTEERdelta | percTEERdelta_nctrl_ave | IL8 | chunkypoop | oily | TWday | TWlot | proteinLysate | cellPassage | comments | initialTEER | initialTEERrange |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2024-05-14 | 8019 | 12.63 | NA | 10.77 | 0 | 1 | 14 | 9722026 | yes | 22 | repeated | NA | ok |
| 2024-05-14 | 8019 | 10.10 | NA | 7.80 | 0 | 1 | 14 | 9722026 | yes | 22 | repeated | NA | ok |
| 2024-05-14 | 8100 | 41.38 | NA | 27.04 | 1 | 1 | 14 | 9722026 | yes | 22 | repeated | NA | ok |
| 2024-05-14 | 8100 | 37.29 | NA | 17.33 | 1 | 1 | 14 | 9722026 | yes | 22 | repeated | NA | ok |
| 2024-05-14 | 8102 | -4.35 | NA | 4.52 | 0 | 0 | 14 | 9722026 | yes | 22 | repeated | NA | ok |
| 2024-05-14 | 8102 | -5.14 | NA | 4.88 | 0 | 0 | 14 | 9722026 | yes | 22 | repeated | NA | ok |
Due to data access practices at the USDA-ARS WHNRC, metadata was not given to me in one file, and thus has to be joined together.
library(lubridate)
metadata <- read_csv(file.path(data_dir2, "FL100_merged_variables.csv"),
show_col_types = FALSE)
seasonsPlusAlc = full_join(
read_csv(file.path(data_dir2, "FL100_ASA24_Seasons.csv"),
show_col_types = FALSE),
read_csv(file.path(data_dir2, "Chad_FL100_ASA24_Averages.csv"),
show_col_types = FALSE),
by = "UserName") %>%
full_join(read_csv(file.path(data_dir2, "FL100_contraceptive_use.csv"),
show_col_types = FALSE),
by = "UserName") %>%
mutate(avg_season = Avg_season) %>%
select("UserName", "avg_season", "avg_total_ALC", "syst_estro_bcp", "screen_contracept")
metadata <- full_join(metadata, seasonsPlusAlc, by = "UserName") %>%
rename(treatment = UserName) %>%
tidyr::separate(stool_collection, into = c("fecal_date", "fecal_time"), sep = " ") %>%
mutate(fecal_date_parse = mdy(fecal_date)) %>%
mutate(fecal_date_month = month(fecal_date_parse, label = TRUE, abbr = TRUE)) %>%
mutate(flu_season = case_when(is.na(fecal_date_month) ~ NA_real_, fecal_date_month %in% c("Dec", "Jan", "Feb", "Mar") ~ 1, TRUE ~ 0)) %>%
mutate(treatment = as.character(treatment)) %>%
mutate(mg_fbr_p_kcal = avg_total_fiber*1000 / (avg_total_kcal)) %>%
mutate(sex = as.factor(Sex)) %>% dplyr::select(-Sex) %>%
mutate(StoolConsistencyClass = as.factor(StoolConsistencyClass)) %>%
mutate(After24h = as.factor(After24h)) %>%
mutate(across(c(stool_specimen, Age.Category, BMI.Category, edu_level, hhincome),
~ factor(.x, ordered = TRUE))) %>%
mutate(
CRP_BD1mgL = CRP_BD1 / 1000,
CRP_BD3mgL = CRP_BD3 / 1000,
CRP_BD4mgL = CRP_BD4 / 1000,
fecal_mpo_ugg = fecal_mpo / 1000
) %>%
dplyr::select(-CRP_BD1, -CRP_BD3, -CRP_BD4, -fecal_mpo)
45 subjects were assayed more than once. I need to know if categorical variables within these subjects, or even within in-batch duplicates, were consistently documented.
# find which subjects have 2, 3 or 4 biological replicates
for (x in c(2, 3, 4)) {
subject_bioreps <- tidydata %>% # select entire dataset
filter(!treatment %in% c("nctrl", "pctrl")) %>% # filter out controls
group_by(treatment) %>% # tag subject IDs into groups (does not reorder df)
filter(n_distinct(experiment) == x) %>% # filter (ie keep) values where the number of distinct experiments is equal to 3
pull(treatment) %>% # pull out only the treatment column
unique() # filter out duplicates to get only the subjects with 3 biological replicates
cat("Subjects with", x, "biological replicates: \n")
print(subject_bioreps)
cat("\n")
}
## Subjects with 2 biological replicates:
## [1] "7011" "7051" "7054" "7060" "7063" "7066" "7067" "7087" "7105" "7106"
## [11] "6003" "7043" "9035" "8003" "8005" "8019" "9067" "8100" "5046" "7086"
## [21] "7031" "5049" "6071" "7015" "7017" "7033" "8053" "8044" "8083" "8102"
## [31] "8105" "9046" "9033" "9007" "9020" "8009" "8040" "7034" "7055" "7020"
## [41] "7075" "5026"
##
## Subjects with 3 biological replicates:
## [1] "7056" "7079" "6058"
##
## Subjects with 4 biological replicates:
## character(0)
# Check for unmatched values in metadata$treatment
FL100_subjects_nofecalwater <- setdiff(metadata$treatment, data$treatment)
cat("There are ", paste(length(FL100_subjects_nofecalwater), collapse = ", "), " subjects whose fecal waters were not tested\n")
## There are 64 subjects whose fecal waters were not tested
summary_chunk_oily <- tidydata %>% # select entire dataset
filter(!treatment %in% c("nctrl", "pctrl")) %>% # filter out controls
group_by(treatment) %>% # tag subject IDs into groups (does not reorder df)
summarise( # collapse groups into a single row (now only "treatment" column)
chunkypoop_same = n_distinct(chunkypoop) == 1, # add a chunkypoop_same column, defined by the T/F statement
oily_same = n_distinct(oily) == 1,
.groups = "drop"
)
inconsistent_chunkypoop_oily_subjects <- summary_chunk_oily %>%
filter(!chunkypoop_same | !oily_same) # Keep rows where chunkypoop_same or oily_same is FALSE
knitr::kable(inconsistent_chunkypoop_oily_subjects)
| treatment | chunkypoop_same | oily_same |
|---|---|---|
| 7043 | FALSE | TRUE |
| 7051 | FALSE | TRUE |
| 7106 | FALSE | TRUE |
inconsistent_chunkypoop_subjects <- summary_chunk_oily %>%
filter(!chunkypoop_same) %>%
pull(treatment) # pull() transforms the tibble into a vector
inconsistent_oily_subjects <- summary_chunk_oily %>%
filter(!oily_same) %>%
pull(treatment)
# Show inconsistent subjects BEFORE mutation
cat("### Data before Mutation:\n")
## ### Data before Mutation:
tidydata %>%
filter(treatment %in% union(inconsistent_chunkypoop_subjects, inconsistent_oily_subjects)) %>% # union() removes duplicates, unlike c(), but %in% doesn't consider duplicates anyways.
select(treatment, chunkypoop, oily) %>%
arrange(treatment) %>%
knitr::kable()
| treatment | chunkypoop | oily |
|---|---|---|
| 7043 | 1 | 0 |
| 7043 | 1 | 0 |
| 7043 | 0 | 0 |
| 7043 | 0 | 0 |
| 7051 | 0 | 1 |
| 7051 | 0 | 1 |
| 7051 | 1 | 1 |
| 7051 | 1 | 1 |
| 7106 | 1 | 0 |
| 7106 | 1 | 0 |
| 7106 | 0 | 0 |
| 7106 | 0 | 0 |
# Perform mutation
tidydata2 <- tidydata %>%
mutate(
chunkypoop = if_else(treatment %in% inconsistent_chunkypoop_subjects, NA, chunkypoop), # for each value of chunkypoop, if treatment is in the list of inconsistent chunkypoop subjects, then it is changed to NA, otherwise, it stays the same
oily = if_else(treatment %in% inconsistent_oily_subjects, NA, oily)
)
# Show AFTER mutation
cat("### Data after Mutation:\n")
## ### Data after Mutation:
tidydata2 %>%
filter(treatment %in% union(inconsistent_chunkypoop_subjects, inconsistent_oily_subjects)) %>%
select(treatment, chunkypoop, oily) %>%
arrange(treatment) %>%
knitr::kable()
| treatment | chunkypoop | oily |
|---|---|---|
| 7043 | NA | 0 |
| 7043 | NA | 0 |
| 7043 | NA | 0 |
| 7043 | NA | 0 |
| 7051 | NA | 1 |
| 7051 | NA | 1 |
| 7051 | NA | 1 |
| 7051 | NA | 1 |
| 7106 | NA | 0 |
| 7106 | NA | 0 |
| 7106 | NA | 0 |
| 7106 | NA | 0 |
The chunkiness of the poop for subjects 7043, 7051, and 7106 was not consistently documented. I will make the chunkiness field for these subjects NA.
Each experiment conducted technical duplicates of the controls and the fecal waters, so this is the average of each technical duplicate within each batch. The data distribution of the technical replicates will be more useful for assessing the experimental design than for answering biological questions.
# find the average of each *batch's* control assays
avg_tech_dups <- tidydata2 %>%
group_by(experiment, treatment) %>% # group by experiment, then treatment, to get the average of the technical duplicates
summarise( # collapse the groups
mean_percTEERdelta = mean(percTEERdelta, na.rm = TRUE),
mean_IL8 = mean(IL8, na.rm = TRUE),
.groups = "drop"
)
Some fecal waters were experimented more than once, so this is the average of all technical and biological replicates of the fecal water treatments. The data distribution of all replicates of each subject will be used to infer biological patterns rather than assessing the experimental design.
# create a tibble for analyzing the fecal water experiments
avg_bio_reps_fecal <- tidydata2 %>%
filter(!treatment %in% c("nctrl", "pctrl")) %>%
group_by(treatment) %>%
summarise(
mean_percTEERdelta = mean(percTEERdelta, na.rm = TRUE),
mean_IL8 = mean(IL8, na.rm = TRUE),
chunkypoop = first(chunkypoop),
oily = first(oily),
.groups = "drop"
)
# Find the average and standard deviation of the fecal waters assayed more than once
avg_bio_reps <- tidydata2 %>%
group_by(treatment) %>%
summarise(
mean_percTEERdelta = mean(percTEERdelta, na.rm = TRUE),
sd_percTEERdelta = sd(percTEERdelta, na.rm = TRUE),
mean_IL8 = mean(IL8, na.rm = TRUE),
sd_IL8 = sd(IL8, na.rm = TRUE),
.groups = "drop"
)
avg_fbioreps_ctechreps = full_join(
avg_tech_dups %>%
dplyr::select(treatment, mean_percTEERdelta, mean_IL8) %>%
filter(treatment %in% c("nctrl", "pctrl")),
avg_bio_reps_fecal,
by = join_by(treatment, mean_percTEERdelta, mean_IL8))
knitr::kable(avg_bio_reps %>% head())
| treatment | mean_percTEERdelta | sd_percTEERdelta | mean_IL8 | sd_IL8 |
|---|---|---|---|---|
| 5001 | 15.345 | 1.788980 | 4.710 | 0.8626703 |
| 5005 | -18.600 | 5.176022 | 29.160 | 1.1313708 |
| 5006 | 16.750 | 7.226631 | 40.995 | 1.6899852 |
| 5007 | 15.950 | 1.640488 | 33.535 | 3.3728993 |
| 5009 | -3.130 | 2.913280 | 5.325 | 0.0070711 |
| 5010 | 22.675 | 3.669884 | 24.565 | 1.9445436 |
scale_factor_teer1 <- 85
scale_factor_il81 <- 1450
scale_factor_teer2 <- 82
scale_factor_il82 <- 220
p1 = ggplot(avg_tech_dups %>% filter(treatment %in% c("nctrl")), # take data filtered for neg ctrl data
aes(x = mean_IL8)) + # plot only mean_IL8 on the x axis. "x" is hard coded into geom_histogram(), thus `aes(x = mean_IL8)`
geom_histogram(binwidth = 1, fill = "lightblue", color = "black") + # add a histogram
geom_density( # add a density plot
aes(y = after_stat(density) * scale_factor_teer1), # compute density values, then multiply by a scaling factor
color = "darkblue",
linewidth = 1
) +
scale_y_continuous( # function for defining the y axes
name = "Count", # name the primary y-axis
sec.axis = # argument for creating a second axis on the right
sec_axis( # fxn to create an anonymous object (also could create a named object somewhere else)
~ . / scale_factor_teer1, name = "Density") # define the second axis as a fxn that takes the primary y-axis and divides it by the scaling factor, and then name the axis "Density"
) +
labs(
title = paste0("Histogram of the replicate-averaged IL8 concentration\nin the no-change controls"),
x = "IL-8 pg/mL",
y = "Count") +
theme_minimal() + # minimize visual clutter
theme(plot.title = element_text(size = 14))
p2 = ggplot(avg_tech_dups %>% filter(treatment %in% c("pctrl")), aes(x = mean_IL8)) +
geom_histogram(binwidth = 20, fill = "lightblue", color = "black") +
geom_density(
aes(y = after_stat(density) * scale_factor_il81),
color = "darkblue",
linewidth = 1
) +
scale_y_continuous(
name = "Count",
sec.axis = sec_axis(~ . / scale_factor_il81, name = "Density")
) +
labs(title = paste0("Histogram of the replicate-averaged IL8 concentration\nin the pro-inflammatory controls"),
x = "IL-8 pg/mL",
y = "Count") +
theme_minimal() +
theme(plot.title = element_text(size = 14))
p3 = ggplot(avg_tech_dups %>%
filter(treatment %in% c("nctrl")), aes(x = mean_percTEERdelta)) +
geom_histogram(binwidth = 1, fill = "lightblue", color = "black") +
geom_density(
aes(y = after_stat(density) * scale_factor_teer2),
color = "darkblue",
linewidth = 1
) +
scale_y_continuous(
name = "Count",
sec.axis = sec_axis(~ . / scale_factor_teer2, name = "Density")
) +
labs(title = paste0("Histogram of the replicate-averaged percent TEER change\nin the no-change controls"),
x = "%"~Delta~"TEER",
y = "Count") +
theme_minimal() +
theme(plot.title = element_text(size = 14))
p4 = ggplot(avg_tech_dups %>%
filter(treatment %in% c("pctrl")), aes(x = mean_percTEERdelta)) +
geom_histogram(binwidth = 3, fill = "lightblue", color = "black") +
geom_density(
aes(y = after_stat(density) * scale_factor_il82),
color = "darkblue",
linewidth = 1
) +
scale_y_continuous(
name = "Count",
sec.axis = sec_axis(~ . / scale_factor_il82, name = "Density")
) +
labs(title = paste0("Histogram of the replicate-averaged percent TEER change\nin the pro-inflammatory controls"),
x = "%"~Delta~"TEER",
y = "Count") +
theme_minimal() +
theme(plot.title = element_text(size = 14))
(p1 + p2) / (p3 + p4)
Because I know ahead of time that I want to facet the QQ plots and
the box plots of the control data, I need to pivot the tidy data. I’m
pipe the pivot() to mutate() to generate a new field, which will be a
label for each facet (subplot) within the larger plot. If I didn’t
facet, I would create a plot for each data source (i.e. ncontrol TEER,
poscontrol TEER, ncontrol IL-8, poscontrol IL-8) Note: the above
histograms and density plots are not built from faceted data because of
the double y-axis. Instead, separate plots are generated and then
faceted together with patchwork
avg_ctrl_tech_dups_long <- avg_tech_dups %>%
dplyr::select(treatment, mean_percTEERdelta, mean_IL8) %>%
filter(treatment %in% c("nctrl", "pctrl")) %>%
pivot_longer(
cols = c(mean_percTEERdelta, mean_IL8),
names_to = "variable",
values_to = "value"
) %>%
mutate(facet_label = paste(variable, treatment, sep = "_"))
# Generate Q-Q plots
ggplot(avg_ctrl_tech_dups_long, aes(sample = value)) +
stat_qq() + # stat_qq() has the "sample" variable hard-coded, thus, `aes(sample = value)`
stat_qq_line() +
facet_wrap( # facet_wrap() is a ggplot2 function that splits your data into multiple subplots (small multiples) based on a categorical variable, wrapping them into a grid layout. That categorical value is what I created in the pivot() command
vars(facet_label), # the first argument of facet_wrap() is the label-by, which is being defined by the vars() function, which produces a ggplot-specific object that ggplot knows what to do with
scales = "free_y") + # IL-8 and TEER change have different scales, so R will automatically figure out the scale
theme_minimal() +
labs(
title = "Q-Q Plots for control replicate-averaged IL-8 conc. and %"~Delta~"TEER",
x = "Theoretical Quantiles",
y = "Sample Quantiles"
)
# boxplot
ggplot(avg_ctrl_tech_dups_long, aes(x = treatment, y = value)) + # geom_boxplot() has the "x" variable hard-coded, thus, `aes(x = treatment, y = value)`
geom_boxplot() +
facet_wrap(
vars(facet_label),
scales = "free_y",
nrow = 1) + # i like the aesthetic better of a single row of boxplots, despite the individually-scaled y-axes
theme_minimal() +
theme(axis.text.x = element_blank()) + # eliminate default axis label
#strip.text = element_text(face = "bold")
labs(
title = "Boxplots for control replicate-averaged IL8 conc. and %"~Delta~"TEER",
x = NULL,
y = "Value"
)
Variance quantifies how dispersed the data is. The mean is subtracted from each datapoint, then squared, and then the average is taken.
Standard deviation also quantifies the dispersion of the data because it is the square root of the variance, thus, it is in the same units as the data.
Coefficient of variation makes relevant the size of the standard deviation relative to the mean, because it is calculated as proportion of the mean that is the standard deviation. Thus, it is comparable between measures and datasets. This is only applicable to positive values whose mean is not close to zero (i.e. not applicable to the mean percent change in TEER)
Shapiro-Wilk test for normality quantifies how likely it is that the data is non-normal.
Skewness informs why non-normal data is non-normal by quantifying how asymetric or skewed the data is. Could inform the necessity of transformation. Right-skewed, aka positive skew, is indicated by the “skewness” being greater than 0.
Kurtosis tells you about tail behavior and outlier risk. A value equal to 3 means no excess kurtosis (likely normal distribution), greater than 3 means “leptokurtic” heavy tails, and lesser than 3 means “platykurtic” light tails.
Jarque-Bera test for normality quantifies how likely it is that the data is non-normal by combining skewness and kurtosis. It is therefore less sensitive to large sample sizes, and wont pick up non-normality due to reasons other than skewness or kurtosis. It doesn’t quantify skewness and kurtosis, respectively, but is interpreted as an “or”.
# Function to compute all stats.
compute_stats <- function(x) {
tibble(
mean = round(mean(x, na.rm = TRUE), 3),
median = round(median(x, na.rm = TRUE), 3),
min = min(x, na.rm = TRUE),
max = max(x, na.rm = TRUE),
var = round(var(x, na.rm = TRUE), 3),
sd = round(sd(x, na.rm = TRUE), 3),
cv = round(sd/abs(mean), 3),
shapiro_p = round(tryCatch( # tryCatch returns NA instead of erroring out
shapiro.test(x)$p.value,
error = function(e) NA
),
3),
skew = round(moments::skewness(x), 3),
kurt = round(moments::kurtosis(x), 3),
jarq_p = round(tryCatch(
jarque.test(x)$p.value,
error = function(e) NA
),
3)
)
}
avg_ctrl_tech_dups_stats = avg_ctrl_tech_dups_long %>%
group_by(treatment, variable) %>% # label the untidy data so that `summarize()` knows what data to collapse and thus apply to the statistical test functions
summarise(compute_stats(value))
## `summarise()` has grouped output by 'treatment'. You can override using the
## `.groups` argument.
knitr::kable(avg_ctrl_tech_dups_stats) # collapse the groups and do unto each row the `compute_stats` function created above. Reminder: `summarize()` inherits the columns specified by "group_by", so they are not removed by summarize.
| treatment | variable | mean | median | min | max | var | sd | cv | shapiro_p | skew | kurt | jarq_p |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| nctrl | mean_IL8 | 6.537 | 6.475 | 0.000 | 20.965 | 19.926 | 4.464 | 0.683 | 0.032 | 0.405 | 3.318 | 0.394 |
| nctrl | mean_percTEERdelta | -6.021 | -6.305 | -12.850 | 0.900 | 8.325 | 2.885 | 0.479 | 0.787 | 0.099 | 3.162 | 0.922 |
| pctrl | mean_IL8 | 152.740 | 144.590 | 63.075 | 320.500 | 3179.381 | 56.386 | 0.369 | 0.013 | 0.827 | 3.324 | 0.031 |
| pctrl | mean_percTEERdelta | -29.638 | -28.790 | -49.295 | -14.665 | 38.312 | 6.190 | 0.209 | 0.090 | -0.654 | 4.179 | 0.022 |
The no-change control did a good job at maintaining permeability and keeping inflammation consistently low, at least within a much smaller range than the pro-inflammatory control, which was not very consistent in how much inflammation and permeability it caused. In general, in the controls, IL-8 production is more variable than permeability in response to challenge.
Experiments were done in batches of several fecal samples from 2018-2024, and because of the nature of cell culture experiments, Zeynep suggests that the TEER data need to be normalized by the no-change control within batches before batches are analyzed together. This may account for whatever systematic error was imparted on the entire batch. After normalization, replicates would need to be averaged. However, this makes the normalized treatment group values co-vary with the no-change control group, making comparisons between the two groups more difficult. In addition, the no-change control group does very little to inflammation and permeability. I will not normalize by the negative control yet, but here is the code to do so.
# mean of only the control data
avg_ctrls <- tidydata2 %>%
select(experiment, treatment, percTEERdelta, IL8) %>%
filter(treatment %in% c("nctrl", "pctrl")) %>%
group_by(experiment) %>%
summarise(
avg_percTEERdelta_nctrl = mean(percTEERdelta[treatment == "nctrl"], na.rm = TRUE),
avg_percTEERdelta_pctrl = mean(percTEERdelta[treatment == "pctrl"], na.rm = TRUE),
avg_IL8_nctrl = mean(IL8[treatment == "nctrl"], na.rm = TRUE),
avg_IL8_pctrl = mean(IL8[treatment == "pctrl"], na.rm = TRUE),
.groups = "drop"
)
data_norm <- tidydata2 %>%
left_join(
avg_ctrls %>% select(experiment, avg_percTEERdelta_nctrl, avg_IL8_nctrl),
by = "experiment"
) %>%
mutate(
percTEERdelta_cor = percTEERdelta - avg_percTEERdelta_nctrl,
IL8_cor = IL8 - avg_IL8_nctrl
) %>%
select(-avg_percTEERdelta_nctrl, -avg_IL8_nctrl, -percTEERdelta_nctrl_ave) # Optional: drop these columns if you don't need them
knitr::kable(head(data_norm))
knitr::kable(tail(data_norm))
data_norm_mean <- data_norm %>%
filter(!treatment %in% c("nctrl", "pctrl")) %>%
group_by(treatment) %>%
summarise(
avg_percTEERdelta_norm = mean(percTEERdelta_cor, na.rm = TRUE),
sd_percTEERdelta_norm = sd(percTEERdelta_cor, na.rm = TRUE),
avg_IL8_norm = mean(IL8_cor, na.rm = TRUE),
sd_IL8_norm = sd(IL8_cor, na.rm = TRUE),
.groups = "drop"
) %>%
arrange(desc(avg_percTEERdelta_norm))
write.csv(data_norm_mean,file.path(data_dir, "/delete.csv", row.names = FALSE)
knitr::kable(data_norm_mean)
# Set a scaling factor so density visually aligns with histogram
scale_factor_teer <- 3350
scale_factor_il8 <- 1300
p1 <- ggplot(avg_bio_reps_fecal, aes(x = mean_percTEERdelta)) +
geom_histogram(binwidth = 10, fill = "lightblue", color = "black") +
geom_density(
aes(y = after_stat(density) * scale_factor_teer),
color = "darkblue",
linewidth = 1
) +
scale_y_continuous(
name = "Count",
sec.axis = sec_axis(~ . / scale_factor_teer, name = "Density")
) +
labs(
title = "Histogram of the replicate-averaged percent TEER change\nin the fecal water treatment group",
x = "%"~Delta~"TEER"
) +
theme_minimal()
p2 <- ggplot(avg_bio_reps_fecal, aes(x = mean_IL8)) +
geom_histogram(binwidth = 3, fill = "lightblue", color = "black") +
geom_density(
aes(y = after_stat(density) * scale_factor_il8),
color = "darkblue",
linewidth = 1
) +
scale_y_continuous(
name = "Count",
sec.axis = sec_axis(~ . / scale_factor_il8, name = "Density")
) +
labs(
title = "Histogram of the replicate-averaged IL8 concentration\nin the fecal water treatment group",
x = "IL8"
) +
theme_minimal()
# Combine
p1 + p2
# Convert to long format so all columns can be plotted at once
avg_data_treat_long <- avg_bio_reps_fecal %>%
select(mean_percTEERdelta, mean_IL8) %>%
pivot_longer(cols = everything(),
names_to = "variable",
values_to = "value"
)
ggplot(avg_data_treat_long, aes(sample = value)) +
stat_qq() +
stat_qq_line() +
facet_wrap(~variable, scales = "free", nrow = 1) +
theme_minimal() +
labs(
title = "Q–Q Plots for replicate-avgd IL8 conc. and %"~Delta~"TEER of fecal water treatments",
x = "Theoretical Quantiles",
y = "Sample Quantiles"
)
# Boxplot the treatment group
ggplot(avg_data_treat_long, aes(x = "", y = value)) +
geom_boxplot() +
facet_wrap(~variable, scales = "free_y", nrow = 1) +
theme_minimal() +
labs(
title = "Boxplots for replicate-avgd IL8 conc. and %"~Delta~"TEER of fecal water treatments",
x = NULL,
y = "Value"
)
avg_fbioreps_ctechreps_long <- avg_fbioreps_ctechreps %>%
select(-oily, -chunkypoop) %>%
pivot_longer(-treatment, names_to = "variable", values_to = "value") %>%
mutate(
treatment_group = case_when(
treatment == "nctrl" ~ "nctrl",
treatment == "pctrl" ~ "pctrl",
TRUE ~ "fecal"
),
# Set factor order here
treatment_group = factor(treatment_group, levels = c("nctrl", "fecal", "pctrl"))
)
# Boxplot the treatment group in the context of the controls
ggplot(avg_fbioreps_ctechreps_long, aes(x = treatment_group, y = value)) +
geom_boxplot(fill = "lightblue", color = "black") +
facet_wrap(~variable, scales = "free_y", labeller = as_labeller(c(mean_percTEERdelta = "% Δ TEER", mean_IL8 = "IL-8 (pg/mL)"))) +
theme_minimal() +
labs(
title = "Boxplots contextualizing treatment group values with those of the controls",
x = NULL,
y = NULL # remove shared y-axis label
)
density_data_prep_teer <- avg_fbioreps_ctechreps %>%
filter(!is.na(mean_percTEERdelta)) %>%
mutate(group = case_when(
treatment == "nctrl" ~ "No-change Control",
treatment == "pctrl" ~ "Pro-inflammatory Control",
TRUE ~ "Fecal Water Treatment"
))
tertiles_TEER <- quantile(avg_fbioreps_ctechreps %>%
filter(!treatment %in% c("nctrl", "pctrl")) %>%
pull(mean_percTEERdelta),
probs = c(0, 1/3, 2/3, 1), na.rm = TRUE)
quartiles_TEER <- quantile(avg_fbioreps_ctechreps %>%
filter(!treatment %in% c("nctrl", "pctrl")) %>%
pull(mean_percTEERdelta),
probs = c(0, 0.25, 0.5, 0.75, 1), na.rm = TRUE)
cat("mean_percTEERdelta quartiles")
## mean_percTEERdelta quartiles
print(quartiles_TEER)
## 0% 25% 50% 75% 100%
## -100.0000 -5.2775 10.6650 23.5625 210.5200
cat("mean_percTEERdelta tertiles")
## mean_percTEERdelta tertiles
print(tertiles_TEER)
## 0% 33.33333% 66.66667% 100%
## -100.000000 1.323333 17.066667 210.520000
# Compute density per group, restricted to group range
density_data_teer <- density_data_prep_teer %>%
group_by(group) %>%
summarise(density = list({
x <- mean_percTEERdelta
dens <- density(x, from = min(x), to = max(x), n = 512)
tibble(x = dens$x, y = dens$y)
}), .groups = "drop") %>%
unnest(density)
# Min and max lines
minmax_lines <- density_data_prep_teer %>%
group_by(group) %>%
summarise(
min_val = min(mean_percTEERdelta, na.rm = TRUE),
max_val = max(mean_percTEERdelta, na.rm = TRUE),
.groups = "drop"
) %>%
pivot_longer(cols = c(min_val, max_val), names_to = "bound", values_to = "x")
# Define the full plot
main_plot <- ggplot(density_data_prep_teer, aes(x = mean_percTEERdelta)) +
geom_histogram(aes(y = after_stat(density), fill = group),
position = "identity", binwidth = 3,
alpha = 0.4, color = "black") +
geom_line(data = density_data_teer, aes(x = x, y = y, color = group), linewidth = 1.2) +
geom_vline(data = minmax_lines, aes(xintercept = x, color = group),
linetype = "dashed", linewidth = 0.8, alpha = 0.8) +
geom_text(
data = minmax_lines,
aes(x = x + 5, y = 0.15, label = round(x, 1), color = group),
angle = 0, vjust = -0.5, size = 5, show.legend = FALSE
) +
labs(
title = "Overlayed Distribution of Replicate-averaged %\u0394TEER",
caption = paste0("The min and max of the groups are shown with a labeled and dashed line colored by group, and the boundaries of the second tertile are shown by black lines.\n", " Note that the minimum value of the second tertile happens to be almost exactly the maximum value of the no-change control.\n"),
x = "% \u0394TEER",
y = "Density",
fill = "Group",
color = "Group"
) +
scale_x_continuous(
breaks = seq(-200, 200, by = 50),
minor_breaks = seq(-200, 200, by = 10)
) +
theme_minimal() +
theme(
legend.position.inside = c(0.06, 0.5),
legend.justification = c("left", "top"),
legend.background = element_rect(fill = "white", color = "black")
)
# Define the zoomed-in inset
inset_plot <- ggplot(density_data_prep_teer, aes(x = mean_percTEERdelta)) +
geom_histogram(aes(y = after_stat(density), fill = group),
position = "identity", binwidth = 3,
alpha = 0.4, color = "black",show.legend = F) +
geom_line(data = density_data_teer, aes(x = x, y = y, color = group), linewidth = 1.2,show.legend = F) +
geom_vline(data = minmax_lines, aes(xintercept = x, color = group),
linetype = "dashed", linewidth = 0.8, alpha = 0.8,show.legend = F) +
geom_vline(xintercept = c(tertiles_TEER[[2]], tertiles_TEER[[3]]),
color = "black", linewidth = 1.8, alpha = 0.5) +
coord_cartesian(ylim = c(0, 0.025), xlim = c(-50, 70)) +
scale_x_continuous(breaks = seq(-50, 70, by = 10)) +
labs(
title = NULL, x = NULL, y = NULL
) +
theme_minimal(base_size = 8) +
theme(
panel.grid = element_blank(),
axis.ticks = element_line(),
axis.line = element_line()
)
final_plot <- ggdraw() +
draw_plot(main_plot) +
draw_plot(inset_plot, x = 0.42, y = 0.4, width = 0.5, height = 0.5)
final_plot
density_data_prep_il8 <- avg_fbioreps_ctechreps %>%
filter(!is.na(mean_IL8)) %>%
mutate(group = case_when(
treatment == "nctrl" ~ "No-change Control",
treatment == "pctrl" ~ "Pro-inflammatory Control",
TRUE ~ "Fecal Water Treatment"
))
# Compute density per group, restricted to group range
density_data_il8 <- density_data_prep_il8 %>%
group_by(group) %>%
summarise(density = list({
x <- mean_IL8
dens <- density(x, from = min(x), to = max(x), n = 512)
tibble(x = dens$x, y = dens$y)
}), .groups = "drop") %>%
unnest(density)
# Min and max lines
minmax_lines_il8 <- density_data_prep_il8 %>%
group_by(group) %>%
summarise(
min_val = min(mean_IL8, na.rm = TRUE),
max_val = max(mean_IL8, na.rm = TRUE),
.groups = "drop"
) %>%
pivot_longer(cols = c(min_val, max_val), names_to = "bound", values_to = "x")
# Define the full plot
main_plot_il8 <- ggplot(density_data_prep_il8, aes(x = mean_IL8)) +
geom_histogram(aes(y = after_stat(density), fill = group),
position = "identity", binwidth = 3,
alpha = 0.4, color = "black") +
geom_line(data = density_data_il8, aes(x = x, y = y, color = group), linewidth = 1.2) +
geom_vline(data = minmax_lines_il8, aes(xintercept = x, color = group),
linetype = "dashed", linewidth = 0.8, alpha = 0.8) +
geom_text(
data = minmax_lines_il8,
aes(x = x + 5, y = 0.10, label = round(x, 1), color = group),
angle = 0, vjust = -0.5, size = 5, show.legend = FALSE
) +
labs(
title = "Overlayed Distribution of Replicate-averaged IL-8",
x = "IL-8",
y = "Density",
fill = "Group",
color = "Group"
) +
scale_x_continuous(
breaks = seq(0, 350, by = 50),
minor_breaks = seq(0, 350, by = 10)
) +
theme_minimal() +
theme(
legend.position.inside = c(0.3, 0.55),
legend.justification = c("left", "top"),
legend.background = element_rect(fill = "white", color = "black")
)
# Define the zoomed-in inset
inset_plot_il8 <- ggplot(density_data_prep_il8, aes(x = mean_IL8)) +
geom_histogram(aes(y = after_stat(density), fill = group),
position = "identity", binwidth = 3,
alpha = 0.4, color = "black",
show.legend = F) +
geom_line(data = density_data_il8, aes(x = x, y = y, color = group), linewidth = 1.2, show.legend = F) +
geom_vline(data = minmax_lines_il8, aes(xintercept = x, color = group),
linetype = "dashed", linewidth = 0.8, alpha = 0.8, show.legend = F) +
coord_cartesian(ylim = c(0, 0.025), xlim = c(0, 70)) +
scale_x_continuous(breaks = seq(0, 70, by = 10)) +
labs(
title = NULL, x = NULL, y = NULL
) +
theme_minimal(base_size = 8) +
theme(
panel.grid = element_blank(),
axis.ticks = element_line(),
axis.line = element_line()
)
final_plot_il8 <- ggdraw() +
draw_plot(main_plot_il8) +
draw_plot(inset_plot_il8, x = 0.6, y = 0.4, width = 0.33, height = 0.5)
final_plot_il8
ggsave(file.path(data_dir, "/PLOTS/TEER_distribution.png"), plot = final_plot,
width = 12, height = 8, dpi = 300, units = "in", create.dir = TRUE)
ggsave(file.path(data_dir, "/PLOTS/IL8_distribution.png"), plot = final_plot_il8,
width = 12, height = 8, dpi = 300, units = "in", create.dir = TRUE)
# Method 2: PDF files (vector format, great for publications)
ggsave(file.path(data_dir, "/PLOTS/TEER_distribution.pdf"), plot = final_plot,
width = 12, height = 8, units = "in", create.dir = TRUE)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on '% ΔTEER' in 'mbcsToSbcs': for Δ (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'Overlayed Distribution of Replicate-averaged %ΔTEER' in
## 'mbcsToSbcs': for Δ (U+0394)
ggsave(file.path(data_dir, "/PLOTS/IL8_distribution.pdf"), plot = final_plot_il8,
width = 12, height = 8, units = "in", create.dir = TRUE)
# # Method 3: SVG files (vector format, web-friendly)
# ggsave(file.path(data_dir, "/PLOTS/TEER_distribution.svg"), plot = final_plot,
# width = 12, height = 8, units = "in", create.dir = TRUE)
#
# ggsave(file.path(data_dir, "/PLOTS/IL8_distribution.svg"), plot = final_plot_il8,
# width = 12, height = 8, units = "in", create.dir = TRUE)
# # Method 4: TIFF files (high quality, often required by journals)
# ggsave(file.path(data_dir, "/PLOTS/TEER_distribution.tiff"), plot = final_plot,
# width = 12, height = 8, dpi = 300, units = "in", compression = "lzw", create.dir = TRUE)
#
# ggsave(file.path(data_dir, "/PLOTS/IL8_distribution.tiff"), plot = final_plot_il8,
# width = 12, height = 8, dpi = 300, units = "in", compression = "lzw", create.dir = TRUE)
See the small gap between the least permeability-inducing pro-inflammatory control duplicate average and the most permeability-inducing no-change control duplicate average, where some fecal waters reside.
teer_min_nctrl = avg_ctrl_tech_dups_stats %>%
filter(treatment == "nctrl" & variable == "mean_percTEERdelta") %>%
pull(min)
teer_max_pctrl = avg_ctrl_tech_dups_stats %>%
filter(treatment == "pctrl" & variable == "mean_percTEERdelta") %>%
pull(max)
avg_fbioreps_ctechreps %>%
filter(mean_percTEERdelta <=
avg_ctrl_tech_dups_stats %>%
filter(treatment == "nctrl" & variable == "mean_percTEERdelta") %>%
pull(min)) %>%
filter(mean_percTEERdelta >=
avg_ctrl_tech_dups_stats %>%
filter(treatment == "pctrl" & variable == "mean_percTEERdelta") %>%
pull(max)) %>%
dplyr::select(., c(treatment, mean_percTEERdelta)) %>%
arrange(mean_percTEERdelta)
## # A tibble: 7 × 2
## treatment mean_percTEERdelta
## <chr> <dbl>
## 1 pctrl -14.7
## 2 8081 -14.4
## 3 8008 -14.1
## 4 8108 -13.7
## 5 8093 -13.3
## 6 8007 -13.0
## 7 nctrl -12.8
avg_fbioreps_ctechreps %>%
filter(mean_percTEERdelta <= teer_min_nctrl) %>%
filter(mean_percTEERdelta >= teer_max_pctrl) %>%
dplyr::select(., c(treatment, mean_percTEERdelta)) %>%
arrange(mean_percTEERdelta)
## # A tibble: 7 × 2
## treatment mean_percTEERdelta
## <chr> <dbl>
## 1 pctrl -14.7
## 2 8081 -14.4
## 3 8008 -14.1
## 4 8108 -13.7
## 5 8093 -13.3
## 6 8007 -13.0
## 7 nctrl -12.8
I am choosing to proceed the biological replicate-averaged values of each fecal water because I am most-interested in how each fecal water affects CACO-2 cells, not the distribution of the affect of each replicate. My goal is to understand why the fecal waters have the affect they do, not question the validity of the experimental design.
| variable | mean | median | min | max | var | sd | cv | shapiro_p | skew | kurt | jarq_p |
|---|---|---|---|---|---|---|---|---|---|---|---|
| mean_IL8 | 22.270 | 14.335 | 0 | 180.3475 | 606.945 | 24.636 | 1.106 | 0 | 3.215 | 16.234 | 0 |
| mean_percTEERdelta | 10.818 | 10.665 | -100 | 210.5200 | 984.990 | 31.385 | 2.901 | 0 | 1.755 | 13.189 | 0 |
This block is copied from Dr. Andrew Oliver
path_plasma_input <- file.path(data_dir3, "plasma_scfas.csv")
path_plasma_output <- file.path(data_dir3, "plasma_scfas_processed.csv")
path_fecal_input <- file.path(data_dir3, "fecal_scfa_fl100.csv")
path_new_but_isobut <- file.path(data_dir3, "butyrate_isob_new_integration_12-06-22.csv")
path_fecal_output <- file.path(data_dir3, "fecal_scfas_processed.csv")
path_fecal_vars <- file.path(data_dir3, "FL100_stool_variables.txt")
`%!in%` <- Negate(`%in%`)
## read in PLASMA SCFA data
scfa_plasma <- readr::read_delim(path_plasma_input, delim = ",", col_types = "fdddf")
## any missing or NA in propionate or butyrate, make it 10
## this is because you supply 10 as the value that is left-censored in
## censored models.
###CHAD SAYS: to deal with rank statistical tests, might want to collapse samples equal to 10
scfa_plasma[c("p_butyric_acid", "p_propionic_acid")][is.na(scfa_plasma[c("p_butyric_acid", "p_propionic_acid")])] <- 10
## create a list of subjects whose plasma samples we have multiples of
scfa_duplicated <- scfa_plasma %>%
dplyr::filter(., duplicates != "") %>%
dplyr::pull(., subject_id) %>%
droplevels()
## create a empty DF in which to place the average of duplicated samples
deduplicated <- data.frame(subject_id=character(),
p_acetic_acid=numeric(),
p_propionic_acid=numeric(),
p_butyric_acid=numeric())
## loop through duplicated samples
scfas <- c("p_acetic_acid", "p_propionic_acid", "p_butyric_acid")
for (subject in unique(scfa_duplicated)) {
## for each duplicated sample, pull it out and all the duplicated SCFA data
scfa_duplicated_tmp <- scfa_plasma %>%
dplyr::filter(., subject_id == subject) %>%
dplyr::select(., subject_id, p_acetic_acid, p_propionic_acid, p_butyric_acid)
## check and make sure you dont have a SCFA that is completely missing data
## you shouldnt with this data
if (max((sum(is.na(scfa_duplicated_tmp$p_acetic_acid))),
(sum(is.na(scfa_duplicated_tmp$p_propionic_acid))),
(sum(is.na(scfa_duplicated_tmp$p_butyric_acid)))) > NROW(scfa_duplicated_tmp)) {
stop("You have duplicated subject_id with completely missing SCFA data for at least one SCFA")
}
## create a vector of the mean SCFA abundance of the values you have,
## ignoring any NAs
missing_means <- colMeans(scfa_duplicated_tmp[2:4], na.rm = T)
count = 1
## add this SCFA mean to the formerly empty, deduplicated DF
deduplicated <- deduplicated %>% tibble::add_row(., subject_id=subject,
p_acetic_acid=missing_means[1],
p_propionic_acid=missing_means[2],
p_butyric_acid=missing_means[3])
}
## remove duplicated from raw data and add back in averaged values
scfa_plasma_dedup <- scfa_plasma %>% dplyr::select(., -duplicates) %>%
dplyr::filter(., subject_id %!in% scfa_duplicated)
scfa_plasma_dedup <- rbind(scfa_plasma_dedup, deduplicated)
## convert ng/mL to nmole/ul because fecal units are nmole/mg
scfa_plasma_dedup$p_butyric_acid_nmol <- scfa_plasma_dedup$p_butyric_acid / (1000 * 88.11)
scfa_plasma_dedup$p_acetic_acid_nmol <- scfa_plasma_dedup$p_acetic_acid / (1000 * 60.052)
scfa_plasma_dedup$p_propionic_acid_nmol <- scfa_plasma_dedup$p_propionic_acid / (1000 * 74.08)
## total here
scfa_plasma_dedup$p_scfa_nmol_total <- scfa_plasma_dedup$p_butyric_acid_nmol +
scfa_plasma_dedup$p_acetic_acid_nmol +
scfa_plasma_dedup$p_propionic_acid_nmol
## create relative abundance values
scfa_plasma_dedup$p_butyric_acid_nmol_norm <- scfa_plasma_dedup$p_butyric_acid_nmol / scfa_plasma_dedup$p_scfa_nmol_total
scfa_plasma_dedup$p_acetic_acid_nmol_norm <- scfa_plasma_dedup$p_acetic_acid_nmol / scfa_plasma_dedup$p_scfa_nmol_total
scfa_plasma_dedup$p_propionic_acid_nmol_norm <- scfa_plasma_dedup$p_propionic_acid_nmol / scfa_plasma_dedup$p_scfa_nmol_total
## READ IN FECAL SCFA
fecal_scfas <- readr::read_delim(path_fecal_input, delim = ",", col_types = "fdddd") %>% dplyr::select(., -butyrate)
new_fecal_butyrate <- readr::read_delim(path_new_but_isobut, delim = ",", col_types = "fdd")
fecal_scfas <- merge(new_fecal_butyrate, fecal_scfas, by = "subject_id")
fecal_scfas <- fecal_scfas %>% rename(., "butyrate" = "new_butyrate", "isobutyrate" = "new_isobutyrate")
fecal_vars <- read.delim(path_fecal_vars)
## get rid of fecal samples that are >24 hrs or after visit 1
fecal_vars <- fecal_vars %>%
dplyr::filter(., diff_time_hrs < 24) %>%
dplyr::filter(., AfterV2 == 0)
fecal_scfas <- fecal_scfas %>% dplyr::filter(., subject_id %in% fecal_vars$subject_id)
## take relative abundane of SCFA
fecal_scfas$acetate_norm <- fecal_scfas$acetate / fecal_scfas$total_scfa
fecal_scfas$butyrate_norm <- fecal_scfas$butyrate / fecal_scfas$total_scfa
fecal_scfas$propionate_norm <- fecal_scfas$propionate / fecal_scfas$total_scfa
left join, so that a datum row must have a subject with TEER and IL-8 data, since TEERless and IL-8less SCFA data is irrelevant to the present analysis.
# join data and metadata
avg_meta = avg_fbioreps_ctechreps %>%
left_join(metadata, by = "treatment")
# join data, metadata, and SCFA data, then discretize TEER into a binary
avg_fecal_meta_SCFA = avg_meta %>%
left_join(
fecal_scfas %>%
rename(treatment = subject_id,
but = butyrate,
isoval = isobutyrate,
prop = propionate,
acet = acetate,
acet_norm = acetate_norm,
prop_norm = propionate_norm,
but_norm = butyrate_norm) %>%
mutate(treatment = as.character(treatment)),
by = "treatment"
)
Danielle and I decided to discretize TEER from fecal water treatment by tertile, the first of which happens to encompass the no-change and pro-inflammatory controls. The categories will be “permeability-protective” and “not permeability-protective” (“protPerm_t”)
avg_fecal_meta_SCFA = avg_fecal_meta_SCFA %>%
mutate(
# discretize based on above or below the most permeable no-change control
protPerm_n = ifelse(treatment %in% c("nctrl", "pctrl"),
NA_integer_,
ifelse(mean_percTEERdelta >= teer_min_nctrl, 1L, 0L)),
# discretize based on above or below the least permeable pro-inflammatory control
protPerm_p = ifelse(treatment %in% c("nctrl", "pctrl"),
NA_integer_,
ifelse(mean_percTEERdelta >= teer_max_pctrl, 1L, 0L)),
# discretize based on value relative to the second and third tertiles (data loss)
protPerm_t = ifelse(treatment %in% c("nctrl", "pctrl"),
NA_integer_,
ifelse(mean_percTEERdelta >= tertiles_TEER[[3]], 1L,
ifelse(mean_percTEERdelta <= tertiles_TEER[[2]], 0L, NA_integer_)))
)
Also, according to Bouzid et al. 2024 J. Nut., “Even among healthy participants, measurements of these markers have a broad range. Using thresholds (>100 μg/g calprotectin; >2000 ng/g myeloperoxidase) suggested by the assay manufacturer, some of the healthy participants exhibited frank GI inflammation. Calprotectin clinical thresholds for fresh stool are usually 50 μg/g, but we selected the higher threshold of 100 μg/g because the samples were frozen and thawed, which can cause cell lysis and falsely elevated calprotectin concentrations. Clinical thresholds have not been established for neopterin and plasma LBP.”
According to the questionable source Singh et al 2025 StatPearls, but also Cleveland Clinic and Mount Sinai, says CRP 1.0 to 10.0 mg/dL indicates moderate inflammation. Mayo Clinic says 8-10 is high. FL100 data is in redcap as ng/mL, which is mg/L. 47 subjects had elevated calprotectin and/or myeloperoxidase. No subjects had C-reactive protein above 80 mg/L (8 mg/dL), but six subjects with elevated calprotectin and/or myeloperoxidase had CRP above 1.
avg_fecal_meta_SCFA_filt <- avg_fecal_meta_SCFA %>%
filter(
!treatment %in% c("nctrl", "pctrl"),
fecal_calprotectin < 100,
fecal_mpo_ugg < 2000)
# Calculate bounds for more-stringent filter
teer_values <- avg_fecal_meta_SCFA %>%
filter(!treatment %in% c("nctrl", "pctrl")) %>%
pull(mean_percTEERdelta)
teer_mean <- mean(teer_values)
teer_sd <- sd(teer_values)
# more stringent filter to remove outliers
avg_fecal_meta_SCFA_filt2 <- avg_fecal_meta_SCFA_filt %>%
filter(between(
mean_percTEERdelta,
teer_mean - 2 * teer_sd,
teer_mean + 2 * teer_sd
))
write.csv(avg_fecal_meta_SCFA,
file.path(data_dir, "data_metadata_joined.csv"),
row.names = FALSE)
write.csv(avg_fecal_meta_SCFA_filt, file.path(data_dir, "treatment_data_metadata_joined_noinflammation.csv"), row.names = FALSE)
cat("<details><summary>Click to expand/collapse table</summary>\n\n")
knitr::kable(avg_fecal_meta_SCFA %>%
filter(!treatment %in% c("nctrl", "pctrl")) %>%
select(-c(Age, Race, Race_description, visit2_date, fecal_date, fecal_time, diff_time_hrs, syst_estro_bcp, screen_contracept, fecal_date_parse)) %>% head())
| treatment | mean_percTEERdelta | mean_IL8 | chunkypoop | oily | BMI | Age.Category | BMI.Category | edu_level | hhincome | avg_total_kcal | avg_total_fiber | avg_total_PRO | hei_asa24_totalveg | hei_asa24_greensandbeans | hei_asa24_totalfruit | hei_asa24_wholefruit | hei_asa24_wholegrain | hei_asa24_totaldairy | hei_asa24_totalproteinfoods | hei_asa24_seaplantproteins | hei_asa24_fattyacids | hei_asa24_sodium | hei_asa24_refinedgrains | hei_asa24_sfat | hei_asa24_addsug | hei_asa24_totalscore | fecal_calprotectin | fecal_neopterin | plasma_lbp_bd1 | fecal_ph | stool_specimen | After24h | bristol_num | st_wt | dist_from_ideal | StoolConsistencyClass | avg_season | avg_total_ALC | fecal_date_month | flu_season | mg_fbr_p_kcal | sex | CRP_BD1mgL | CRP_BD3mgL | CRP_BD4mgL | fecal_mpo_ugg | but | isoval | acet | prop | total_scfa | acet_norm | but_norm | prop_norm | protPerm_n | protPerm_p | protPerm_t |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5001 | 15.345 | 4.710 | 0 | 1 | 20.84 | 1 | 1 | 3 | 1 | 2335.394 | 19.562005 | 127.51781 | 4.654313 | 5.000000 | 0.2811037 | 0.0000000 | 6.4255043 | 4.693745 | 5.000000 | 5.000000 | 7.455219 | 0.0000000 | 10.000000 | 7.1973356 | 10.000000 | 65.70722 | 24.90 | 10.02 | 3.69 | 6.75 | 1 | 0 | 3.5 | 93.90 | 0.5 | normal | spring | 29.00134 | Jun | 0 | 8.376317 | Male | 0.2060 | 0.3075 | 0.2033 | 0.13625 | 9.206765 | 1.751058 | 32.61356 | 8.346758 | 51.31167 | 0.6355974 | 0.1794283 | 0.1626678 | 1 | 1 | NA |
| 5005 | -18.600 | 29.160 | 0 | 0 | 27.54 | 1 | 2 | 6 | 3 | 1217.563 | 7.524632 | 67.24825 | 5.000000 | 2.021267 | 1.3107919 | 2.3830743 | 0.0000000 | 9.672009 | 5.000000 | 0.000000 | 2.616835 | 0.0000000 | 6.080065 | 0.0000000 | 10.000000 | 44.08404 | 54.51 | 7.97 | 7.21 | 7.72 | 1 | 1 | 4.0 | 75.28 | 0.0 | normal | summer | 0.00000 | Jul | 0 | 6.180076 | Female | 4.0842 | 3.6372 | 3.3805 | 0.36100 | NA | NA | NA | NA | NA | NA | NA | NA | 0 | 0 | 0 |
| 5006 | 16.750 | 40.995 | 1 | 0 | 25.93 | 2 | 2 | 5 | 6 | 2230.646 | 37.735921 | 118.47995 | 5.000000 | 5.000000 | 5.0000000 | 5.0000000 | 5.5862219 | 6.625616 | 5.000000 | 5.000000 | 4.537007 | 0.0000000 | 10.000000 | 6.8142537 | 10.000000 | 73.56310 | 58.84 | 6.12 | 3.21 | 7.57 | 1 | 0 | 4.0 | 319.70 | 0.0 | normal | summer | 21.18062 | Jul | 0 | 16.917034 | Female | 0.2162 | 0.2065 | NA | 0.68350 | 13.425268 | 3.478748 | 27.53877 | 6.360774 | 49.95069 | 0.5513191 | 0.2687704 | 0.1273411 | 1 | 1 | NA |
| 5007 | 15.950 | 33.535 | 0 | 0 | 27.71 | 1 | 2 | 6 | 3 | 1983.260 | 17.709209 | 59.45370 | 3.723907 | 5.000000 | 4.5046433 | 0.9665248 | 2.6110611 | 3.331994 | 4.301884 | 5.000000 | 2.857936 | 0.5109623 | 4.466815 | 3.5444117 | 7.124591 | 47.94473 | 23.65 | 37.74 | 6.01 | 6.10 | 1 | 0 | 2.0 | 179.58 | 2.0 | hard | summer | 11.14992 | Jul | 0 | 8.929344 | Female | 0.8661 | 0.7780 | 0.7750 | 0.19575 | 10.858761 | 1.389865 | 33.65036 | 14.974260 | 60.91302 | 0.5524330 | 0.1782667 | 0.2458302 | 1 | 1 | NA |
| 5009 | -3.130 | 5.325 | 0 | 1 | 29.80 | 1 | 2 | 4 | 6 | 1467.796 | 24.633549 | 64.21924 | 5.000000 | 5.000000 | 0.0355587 | 0.0000000 | 3.1342512 | 5.886533 | 5.000000 | 4.945277 | 7.629829 | 0.0000000 | 6.906674 | 0.8628563 | 10.000000 | 54.40098 | 104.22 | 9.87 | 15.30 | 6.86 | 1 | 0 | 2.0 | 59.38 | 2.0 | hard | summer | 0.00000 | Jul | 0 | 16.782681 | Female | 3.2200 | 3.0910 | 3.1485 | 0.71275 | 8.147446 | 1.400664 | 23.73812 | 7.771299 | 40.46712 | 0.5866027 | 0.2013350 | 0.1920398 | 1 | 1 | 0 |
| 5010 | 22.675 | 24.565 | 0 | 1 | 23.67 | 1 | 1 | 4 | 3 | 1383.352 | 6.750370 | 56.42612 | 1.324207 | 0.000000 | 1.3608422 | 2.6293827 | 0.9473065 | 9.287367 | 5.000000 | 1.683898 | 0.000000 | 8.1438816 | 4.788193 | 0.0000000 | 6.251874 | 41.41695 | 41.69 | 11.77 | 7.87 | 6.16 | 1 | 1 | 3.0 | 38.70 | 1.0 | normal | summer | 0.00000 | Aug | 0 | 4.879720 | Female | 1.3160 | 1.5142 | 1.8220 | 0.20900 | NA | NA | NA | NA | NA | NA | NA | NA | 1 | 1 | 1 |
cat("\n</details>")
Discretizing continuous TEER values into “yes” or “no” protective of permeability makes inference and predicability possible with logistic regression and binary random forest, respectively. I want to make sure there is a statistically significant difference in continuous TEER between the two TEER groups.
# Prepare the data
plot_data <- avg_fecal_meta_SCFA %>%
filter(!is.na(protPerm_t)) %>%
filter(!treatment %in% c("nctrl", "pctrl")) %>%
select(protPerm_t, value = all_of("mean_percTEERdelta"))
# Group summaries
group_summaries <- plot_data %>%
group_by(protPerm_t) %>%
summarise(
n = n(),
mean = round(mean(value), 2),
median = round(median(value), 2),
.groups = "drop"
)
# Wilcoxon test
wilcox_p <- wilcox.test(value ~ protPerm_t, data = plot_data, alternative="less")$p.value
p_label <- paste0("Wilcoxon (less); p = ", signif(wilcox_p, 3))
# Plot
ggplot(plot_data, aes(x = as.factor(protPerm_t), y = value)) +
geom_boxplot() +
labs(
title = "Permeability-protectiveness by permeability change",
x = "Below 1st tertile or above 3rd tertile",
y = "mean percent TEER change over 24 hours"
) +
geom_text(
data = group_summaries,
aes(
x = as.numeric(protPerm_t) + 1.3,
y = quantile(plot_data$value, 0.75, na.rm = TRUE) + 15,
label = paste0("Mean = ", mean, "\nMedian = ", median)
),
size = 3.5,
vjust = 0,
inherit.aes = FALSE
) +
annotate("text", x = 1.5, y = 220, label = p_label, size = 3)
df = avg_fecal_meta_SCFA %>%
select(Age, Age.Category, protPerm_t, BMI, BMI.Category, sex, StoolConsistencyClass) %>% na.omit()
df %>% count(protPerm_t, BMI.Category) %>%
tidyr::pivot_wider(names_from = BMI.Category, values_from = n, values_fill = 0)
## # A tibble: 2 × 4
## protPerm_t `1` `2` `3`
## <int> <int> <int> <int>
## 1 0 33 33 34
## 2 1 35 37 28
df %>% count(BMI.Category) %>%
tidyr::pivot_wider(names_from = BMI.Category, values_from = n, values_fill = 0)
## # A tibble: 1 × 3
## `1` `2` `3`
## <int> <int> <int>
## 1 68 70 62
df %>% count(protPerm_t, Age.Category) %>%
tidyr::pivot_wider(names_from = Age.Category, values_from = n, values_fill = 0)
## # A tibble: 2 × 4
## protPerm_t `1` `2` `3`
## <int> <int> <int> <int>
## 1 0 44 30 26
## 2 1 27 40 33
df %>% count(protPerm_t, sex) %>%
tidyr::pivot_wider(names_from = sex, values_from = n, values_fill = 0)
## # A tibble: 2 × 3
## protPerm_t Female Male
## <int> <int> <int>
## 1 0 60 40
## 2 1 50 50
df %>% count(protPerm_t, StoolConsistencyClass) %>%
tidyr::pivot_wider(names_from = StoolConsistencyClass, values_from = n, values_fill = 0)
## # A tibble: 2 × 4
## protPerm_t hard normal soft
## <int> <int> <int> <int>
## 1 0 30 52 18
## 2 1 25 47 28
41 fecal waters do not have SCFA data, but I dont think this is something to worry about because the number of observations used to make the model is over 250
# List of variables in the desired order
var_order <- c(
"mean_IL8", "mean_percTEERdelta", "Age", "BMI",
"plasma_lbp_bd1", "CRP_BD1mgL", "st_wt", "bristol_num",
"fecal_calprotectin", "fecal_neopterin", "fecal_mpo_ugg",
"but_norm", "prop_norm", "acet_norm", "but", "prop", "acet", "isoval", "total_scfa", "fecal_ph",
"avg_total_kcal", "avg_total_fiber", "mg_fbr_p_kcal", "avg_total_PRO",
"hei_asa24_totalveg", "hei_asa24_greensandbeans", "hei_asa24_totalfruit", "hei_asa24_wholefruit",
"hei_asa24_wholegrain", "hei_asa24_totaldairy", "hei_asa24_totalproteinfoods",
"hei_asa24_seaplantproteins", "hei_asa24_fattyacids", "hei_asa24_refinedgrains",
"hei_asa24_sfat", "hei_asa24_addsug", "hei_asa24_totalscore"
)
# Make variable a factor with specified order
qq_data <- avg_fecal_meta_SCFA %>%
filter(!treatment %in% c("nctrl", "pctrl")) %>%
select(all_of(var_order)) %>%
pivot_longer(everything(), names_to = "variable", values_to = "value") %>%
mutate(variable = factor(variable, levels = var_order))
qq_data %>%
group_by(variable) %>%
summarise(n_NA = sum(is.na(value))) %>% knitr::kable()
| variable | n_NA |
|---|---|
| mean_IL8 | 0 |
| mean_percTEERdelta | 0 |
| Age | 2 |
| BMI | 2 |
| plasma_lbp_bd1 | 2 |
| CRP_BD1mgL | 4 |
| st_wt | 18 |
| bristol_num | 18 |
| fecal_calprotectin | 2 |
| fecal_neopterin | 6 |
| fecal_mpo_ugg | 2 |
| but_norm | 41 |
| prop_norm | 41 |
| acet_norm | 41 |
| but | 41 |
| prop | 41 |
| acet | 41 |
| isoval | 41 |
| total_scfa | 41 |
| fecal_ph | 2 |
| avg_total_kcal | 3 |
| avg_total_fiber | 3 |
| mg_fbr_p_kcal | 3 |
| avg_total_PRO | 3 |
| hei_asa24_totalveg | 3 |
| hei_asa24_greensandbeans | 3 |
| hei_asa24_totalfruit | 3 |
| hei_asa24_wholefruit | 3 |
| hei_asa24_wholegrain | 3 |
| hei_asa24_totaldairy | 3 |
| hei_asa24_totalproteinfoods | 3 |
| hei_asa24_seaplantproteins | 3 |
| hei_asa24_fattyacids | 3 |
| hei_asa24_refinedgrains | 3 |
| hei_asa24_sfat | 3 |
| hei_asa24_addsug | 3 |
| hei_asa24_totalscore | 3 |
Most of the independent variables do not appear normally distributed according to the Q-Q plots.
Units:
# Q-Q plot of each variable
ggplot(qq_data, aes(sample = value)) +
stat_qq() +
stat_qq_line(color = "red") +
facet_wrap(~ variable, scales = "free", ncol = 4, nrow = 10) +
theme_minimal(base_size = 8) +
labs(title = "Q-Q Plots of Untransformed Variables") +
theme(
strip.text = element_text(size = 10),
plot.title = element_text(size = 14),
axis.text = element_text(size = 8)
)